home *** CD-ROM | disk | FTP | other *** search
- /* Parse HyperText Document Address HTParse.c
- ** ================================
- */
-
#include"capalloc.h"
#include"capstdio.h"
#include "HTUtils.h"
#include "HTParse.h"
#include "tcp.h"

#include <string.h>	/* memmove(), used for overlapping in-place edits */
-
- #define HEX_ESCAPE '%'
-
/*
**	The pieces an address is split into by scan().
**	Every member is either NULL or points at a NUL-terminated string.
**	There is no separate search member: a search part stays attached
**	to the path it follows.
*/
struct struct_parts {
	char *access;		/* text before the first ':' */
	char *host;		/* text following a leading "//" */
	char *absolute;		/* path that followed a leading '/' */
	char *relative;		/* path that did not start with '/' */
	char *anchor;		/* text following '#' */
};
-
-
/*	Strip white space off a string
**	------------------------------
**
** On entry,
**	s	points to a writable, NUL-terminated string.
**
** On exit,
**	Return value points to the first non-white character, or to the
**	terminating zero if there is none.
**	All trailing white space is OVERWRITTEN with zero.
**
**	White space here means space, tab and newline only.
*/

#ifdef __STDC__
char * HTStrip(char * s)
#else
char * HTStrip(s)
    char *s;
#endif
{
#define SPACE(c) (((c)==' ')||((c)=='\t')||((c)=='\n'))
    char * end;

    for (end = s; *end; end++)
        ;				/* Find end of string */

    /*
    ** Walk back from the terminator.  The test is made on end[-1] only
    ** while end > s, so no pointer before the start of the buffer is
    ** ever formed, even for an empty or all-blank string.
    */
    while (end > s && SPACE(end[-1]))
        *--end = 0;			/* Zap trailing blanks */

    while (SPACE(*s))
        s++;				/* Strip leading blanks */

    return s;
}
-
-
- static void scan(char *cp_name, struct struct_parts *SSPp_parts) {
- /*
- * Purpose: Break up an address name into its separate parts.
- * Arguments: cp_name An address (URL) to break up.
- * The name may be incomplete.
- * SSBp_parts The structure to store the different
- * parts in.
- * Return Value: void
- * Remarks/Portability/Dependencies/Restrictions:
- * The following refers to the members of the passed in
- * structure upon return of this function:
- * The absolute xor relative are NULL.
- * host, anchor, and access may be nonzero if they
- * were found in the address.
- * Any nonzero members point to ASCIIZ strings.
- * Revision History:
- * ??-??-?? created
- * 03-28-94 modified for DosLynx
- */
- auto char *cp_after_access;
- auto char *cp_p;
- auto signed short int ssi_length = strlen(cp_name);
-
- /*
- * Initialize all parts of the upcoming parts of the name.
- */
- SSPp_parts->access = SSPp_parts->host = SSPp_parts->absolute =
- SSPp_parts->relative = SSPp_parts->anchor = NULL;
-
- /*
- * Save a pointer to the start of the original name.
- * This is a reference to where the access of the address
- * has ended (i.e. http, ftp, etc...)
- * There may be no access specified.
- */
- cp_after_access = cp_name;
-
- /*
- * Loop through the address to set the access name and track
- * where it ends.
- */
- for(cp_p = cp_name; *cp_p != '\0'; cp_p++) {
- /*
- * We loop for a colon which always follows the access
- * name.
- * (Except in the case of a specified port number???)
- */
- switch(*cp_p) {
- /*
- * Need to break the loop on the following
- * special characters that make up a URL.
- */
- case '/':
- case '#':
- case '.':
- break;
- case ':':
- /*
- * End the string here. No need to keep :
- * Set the access part of the structure.
- * Set to where the string continues after the
- * access string.
- */
- *cp_p = '\0';
- SSPp_parts->access = cp_name;
- cp_after_access = cp_p + 1;
- break;
- default:
- continue;
- }
-
- /*
- * If code gets here, need to break loop.
- */
- break;
- }
-
- /*
- * Loop backwards through the address looking for the tag
- * anchor to first select upon loading.
- */
- for(cp_p = cp_name + ssi_length - 1; cp_p >= cp_name; cp_p--) {
- /*
- * Found the tag anchor, terminate the address before
- * the #, the tag anchor should be the rest of the
- * address to the end of the address.
- */
- if(*cp_p == '#') {
- SSPp_parts->anchor = cp_p + 1;
- *cp_p = '\0';
- }
- }
-
- /*
- * Start back up directly after the specified access type.
- */
- cp_p = cp_after_access;
-
- /*
- * If there we have a /, a host or root should follow.
- */
- if(*cp_p == '/') {
- /*
- * If following the /, we have another /, there is a
- * host following.
- */
- if(*(cp_p + 1) == '/') {
- /*
- * Set the address pointing to the host.
- */
- SSPp_parts->host = cp_p + 2;
-
- /*
- * Attempt to find the end of the host's name
- * beginning, of course, with a path /.
- */
- cp_p = strchr(SSPp_parts->host, '/');
-
- /*
- * A path (root) was found, set the absolute
- * path with it.
- * Be sure to terminate the host name.
- */
- if(cp_p != NULL) {
- *cp_p = '\0';
- SSPp_parts->absolute = cp_p + 1;
- }
- }
- else {
- /*
- * There was no host specified, must use what
- * follows as the absolute path (root).
- */
- SSPp_parts->absolute = cp_p + 1;
- }
- }
- else {
- /*
- * There is no host or root (path) specification in
- * the address so it must be relative.
- * Be careful not to assign an unNULL string if there
- * is actually nothing inside of it.
- */
- SSPp_parts->relative = (*cp_after_access) ?
- cp_after_access : NULL;
- }
-
- /*
- * If there was an access type and an anchor specification
- * but no host, this is an exception.
- * We must restore the tag anchor symbol # to the address
- * and set that there is actually no anchor.
- * In these cases, the anchor is not really an anchor at all.
- * e.g. news:j462#36487@foo.bar
- */
- if(SSPp_parts->access != NULL && SSPp_parts->host == NULL &&
- SSPp_parts->anchor != NULL) {
- *(SSPp_parts->anchor - 1) = '#';
- SSPp_parts->anchor = NULL;
- }
-
- /*
- * All done with the scan.
- */
- }
-
-
- extern char *HTParse(const char *cp_aName, const char *cp_relatedName,
- signed short int ssi_wanted) {
- /*
- * Purpose: Parse an address (URL) name relative to another
- * (URL) name.
- * Arguments: cp_aName The address to parse.
- * cp_relatedName The relative address to parse
- * cp_aName with.
- * ssi_wanted A mask for the bits which are
- * flags on how to parse the address.
- * Return Value: char * A malloced string which is the resulting
- * address requested according to the flags
- * which were set and the original and
- * and relative name.
- * Remarks/Portability/Dependencies/Restrictions:
- * All calling functions should free the memory allocated by
- * HTParse once finished with the return value.
- * Revision History:
- * ??-??-?? created
- * 03-28-94 modified for DosLynx
- */
-
- auto char *cp_result;
- auto char *cp_return_value = NULL;
- auto signed short int ssi_len;
- auto char *cp_name = NULL;
- auto char *cp_rel = NULL;
- auto char *cp_p;
- auto char *cp_access;
- auto struct struct_parts SSP_given, SSP_related;
-
- /*
- * Copy the input strings so that we can split them up into
- * their parts.
- */
- ssi_len = strlen(cp_aName) + strlen(cp_relatedName) + 10;
- /*
- * Allocate space; more than enough.
- */
- cp_result = (char *)malloc(ssi_len);
- /*
- * Report error on not enough memory to allocate.
- */
- if(cp_result == NULL) {
- outofmem(__FILE__, "HTParse");
- }
-
- /*
- * Copy over the two names, allocating memory while doing so.
- * Question: How does StrAllocCopy change the pointer value?
- * Answer: Must be a macro, or this is a bug.
- * It's a macro in HTString, calls HTSACopy
- */
- StrAllocCopy(cp_name, cp_aName);
- StrAllocCopy(cp_rel, cp_relatedName);
-
- /*
- * Break the allocated names up into their respective parts.
- */
- scan(cp_name, &SSP_given);
- scan(cp_rel, &SSP_related);
-
- /*
- * Begin building the requested address.
- */
- *cp_result = '\0';
-
- cp_access = SSP_given.access ? SSP_given.access :
- SSP_related.access;
-
- /*
- * Requesting the access, if there is any, be sent back.
- */
- if(ssi_wanted & PARSE_ACCESS) {
- if(cp_access) {
- strcat(cp_result, cp_access);
- /*
- * Requestor also wants full URL style
- * return. Put in a : to separate access.
- */
- if(ssi_wanted & PARSE_PUNCTUATION) {
- strcat(cp_result, ":");
- }
- }
- }
-
- /*
- * If the access is not specified on either the given address
- * or related address and then also if they are not the same,
- * then disregard all related information.
- */
- if (SSP_given.access && SSP_related.access) {
- if(strcmp(SSP_given.access, SSP_related.access) != 0) {
- SSP_related.host =
- SSP_related.absolute =
- SSP_related.relative =
- SSP_related.anchor = NULL;
- }
- }
-
- /*
- * If requesting the host in the return.
- */
- if(ssi_wanted & PARSE_HOST) {
- /*
- * If either the given or related address has a host
- */
- if(SSP_given.host != NULL || SSP_related.host != NULL) {
- /*
- * Figure where to add the host.
- */
- auto char *cp_tail = cp_result + strlen(cp_result);
- /*
- * If exact URL punctuation requested, add the
- * leading //
- */
- if(ssi_wanted & PARSE_PUNCTUATION) {
- strcat(cp_result, "//");
- }
- /*
- * Append the host
- */
- strcat(cp_result, SSP_given.host != NULL ?
- SSP_given.host : SSP_related.host);
-
- /*
- * We must ignore default port numbers and
- * trailing dots on FQDNs(?) which will cause
- * identical addresses to look different.
- */
- {
- /*
- * Find a : in the host.
- */
- auto char *cp = strchr(cp_tail, ':');
- /*
- * If a port was specified.
- */
- if(cp != NULL && cp_access != NULL) {
- /*
- * Check for redundant access
- * types and port numbers.
- */
- if((strcmp(cp_access, "http") == 0
- && strcmp(cp, ":80") == 0)
- || (strcmp(cp_access,
- "gopher") == 0 && strcmp(cp,
- ":70") == 0)) {
- /*
- * Redundant, end the
- * return address
- * before the port is
- * specified.
- */
- *cp = '\0';
- }
- }
- /*
- * No redundant port specified.
- */
- else if(cp == NULL) {
- /*
- * Set to end of hostname.
- */
- cp = cp_tail + strlen(cp_tail);
- }
-
- /*
- * Back up one since beyond actual
- * end of the hostname.
- */
- cp--;
-
- /*
- * If there is a period at the end of
- * the hostname, kill it.
- */
- if(*cp == '.') {
- *cp = '\0';
- }
- }
- }
- }
-
- /*
- * If there are different hosts, no relative path will be
- * assumed.
- */
- if(SSP_given.host != NULL && SSP_related.host != NULL) {
- if(strcmp(SSP_given.host, SSP_related.host) != 0) {
- SSP_related.absolute =
- SSP_related.relative =
- SSP_related.anchor = NULL;
- }
- }
-
- /*
- * If the path is also part of the requested return.
- */
- if(ssi_wanted & PARSE_PATH) {
- /*
- * If the absolute (full) path is already given.
- */
- if(SSP_given.absolute != NULL) {
- /*
- * Requesting the full URL punctuation
- */
- if(ssi_wanted & PARSE_PUNCTUATION) {
- strcat(cp_result, "/");
- }
- /*
- * Append the absolute path.
- */
- strcat(cp_result, SSP_given.absolute);
- }
- /*
- * Otherwise, we must adopt the given path but not
- * the file name.
- */
- else if(SSP_related.absolute != NULL) {
- /*
- * Append the leading /
- * Shouldn't we check for PARSE_PUNCTUATION?
- * Doing so, possible error.
- * Append the relative absolute path.
- */
- if(ssi_wanted & PARSE_PUNCTUATION) {
- strcat(cp_result, "/");
- }
- strcat(cp_result, SSP_related.absolute);
-
- /*
- * Check to see if we have a relative path
- * to further evaluate and append.
- */
- if(SSP_given.relative != NULL) {
- /*
- * See if there is a search directive
- * in the address, if so avoid it.
- * If there isn't set to the end of
- * the address.
- */
- cp_p = strchr(cp_result, '?');
- if(cp_p == NULL) {
- cp_p = cp_result + strlen(cp_result)
- - 1;
- }
-
- /*
- * Find the last / by backing up and
- * finding it.
- */
- for(; *cp_p != '/'; cp_p--)
- /* NULL body */;
-
- /*
- * Remove the file name from the
- * address and add the given relative
- * path and file.
- */
- *(cp_p + 1) = '\0';
- strcat(cp_result, SSP_given.relative);
-
- /*
- * Simplyfy the resulting address by
- * taking out .. and . stuff
- */
- HTSimplify(cp_result);
- }
- }
- /*
- * Otherwise we use what we have got.
- */
- else if(SSP_given.relative != NULL) {
- strcat(cp_result, SSP_given.relative);
- }
- else if(SSP_related.relative != NULL) {
- strcat(cp_result, SSP_related.relative);
- }
- else {
- /*
- * No inheritance at all.
- */
- strcat(cp_result, "/");
- }
- }
-
- /*
- * If the anchor is requested also.
- */
- if(ssi_wanted & PARSE_ANCHOR) {
- if(SSP_given.anchor != NULL || SSP_related.anchor != NULL)
- {
- /*
- * Keep punctuation if requested.
- */
- if(ssi_wanted & PARSE_PUNCTUATION) {
- strcat(cp_result, "#");
- }
- strcat(cp_result, SSP_given.anchor != NULL ?
- SSP_given.anchor : SSP_related.anchor);
- }
- }
-
- /*
- * Free up the copied anchors.
- * This also frees the memory pointed to by our SSP_* structs
- */
- free(cp_rel);
- free(cp_name);
-
- /*
- * Allocate a new string that will be the correct length.
- */
- StrAllocCopy(cp_return_value, cp_result);
- free(cp_result);
- return(cp_return_value);
- }
-
-
/*	Simplify a filename
**	-------------------
**
** A unix-style file is allowed to contain the sequence xxx/../ which may be
** replaced by "" , and the sequence "/./" which may be replaced by "/".
** Simplification helps us recognize duplicate filenames.
**
**	Thus, 	/etc/junk/../fred 	becomes	/etc/fred
**		/etc/junk/./fred	becomes	/etc/junk/fred
**		/etc/junk/././fred	becomes	/etc/junk/fred
**
** but we should NOT change
**		http://fred.xxx.edu/../..
**
**	or	../../albert.html
**
** On entry,
**	filename	points to a writable, NUL-terminated string.
**
** On exit,
**	filename	has been simplified in place; it never grows.
**			Names shorter than two characters are untouched,
**			and scanning starts at the third character.
*/
#ifdef __STDC__
void HTSimplify(char * filename)
#else
void HTSimplify(filename)
    char * filename;
#endif

{
    char * p;
    char * q;

    if (!filename[0] || !filename[1])	/* Bug fix 12 Mar 93 TBL */
        return;

    p = filename + 2;
    while (*p) {
        if (*p != '/') {
            p++;
            continue;
        }

        if ((p[1]=='.') && (p[2]=='.') && (p[3]=='/' || !p[3])) {
            /*
            ** Find the previous slash.  q never goes below filename;
            ** if there is no earlier slash it stops on filename[0],
            ** which is then simply not a '/'.
            */
            q = p;
            while (q > filename && *--q != '/')
                ;

            /*
            ** Remove "/xxx/.." unless xxx is itself "..", or the
            ** slash found is the second of a "//" pair (the segment
            ** is then a host name, not a directory).
            */
            if (*q=='/' && 0!=strncmp(q, "/../", 4)
                    && !(q > filename+1 && q[-1]=='/')) {
                /* The two ranges overlap, so memmove, not strcpy */
                memmove(q, p+3, strlen(p+3) + 1);
                if (!*filename) strcpy(filename, "/");
                p = q;			/* Start again with prev slash */
                continue;
            }
            p++;			/* xxx/.. leave it! */
        } else if ((p[1]=='.') && (p[2]=='/' || !p[2])) {
            /*
            ** Remove a slash and a dot.  p is not advanced: it now
            ** points at the slash that followed, which may itself
            ** start another "/." or "/..".
            */
            memmove(p, p+2, strlen(p+2) + 1);
        } else {
            p++;
        }
    }
}
-
-
- /* Make Relative Name
- ** ------------------
- **
- ** This function creates and returns a string which gives an expression of
- ** one address as related to another. Where there is no relation, an absolute
- ** address is retured.
- **
- ** On entry,
- ** Both names must be absolute, fully qualified names of nodes
- ** (no anchor bits)
- **
- ** On exit,
- ** The return result points to a newly allocated name which, if
- ** parsed by HTParse relative to relatedName, will yield aName.
- ** The caller is responsible for freeing the resulting name later.
- **
- */
- #ifdef __STDC__
- char * HTRelative(const char * aName, const char *relatedName)
- #else
- char * HTRelative(aName, relatedName)
- char * aName;
- char * relatedName;
- #endif
- {
- char * result = 0;
- CONST char *p = aName;
- CONST char *q = relatedName;
- CONST char * after_access = 0;
- CONST char * path = 0;
- CONST char * last_slash = 0;
- int slashes = 0;
-
- for(;*p; p++, q++) { /* Find extent of match */
- if (*p!=*q) break;
- if (*p==':') after_access = p+1;
- if (*p=='/') {
- last_slash = p;
- slashes++;
- if (slashes==3) path=p;
- }
- }
-
- /* q, p point to the first non-matching character or zero */
-
- if (!after_access) { /* Different access */
- StrAllocCopy(result, aName);
- } else if (slashes<3){ /* Different nodes */
- StrAllocCopy(result, after_access);
- } else if (slashes==3){ /* Same node, different path */
- StrAllocCopy(result, path);
- } else { /* Some path in common */
- int levels= 0;
- for(; *q && (*q!='#'); q++) if (*q=='/') levels++;
- result = (char *)malloc(3*levels + strlen(last_slash) + 1);
- if (result == NULL) outofmem(__FILE__, "HTRelative");
- result[0]=0;
- for(;levels; levels--)strcat(result, "../");
- strcat(result, last_slash+1);
- }
- #ifndef RELEASE
- if (TRACE) fprintf(stderr, "HT: `%s' expressed relative to\n `%s' is\n `%s'.",
- aName, relatedName, result);
- #endif /* RELEASE */
- return result;
- }
-
-
- /* Escape undesirable characters using % HTEscape()
- ** -------------------------------------
- **
- ** This function takes a pointer to a string in which
- ** some characters may be unacceptable unescaped.
- ** It returns a string which has these characters
- ** represented by a '%' character followed by two hex digits.
- **
- ** Unlike HTUnEscape(), this routine returns a malloced string.
- */
-
- PRIVATE CONST unsigned char isAcceptable[96] =
-
- /* Bit 0 xalpha -- see HTFile.h
- ** Bit 1 xpalpha -- as xalpha but with plus.
- ** Bit 3 ... path -- as xpalphas but with /
- */
- /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
- { 0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4, /* 2x !"#$%&'()*+,-./ */
- 7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0, /* 3x 0123456789:;<=>? */
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 4x @ABCDEFGHIJKLMNO */
- 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7, /* 5X PQRSTUVWXYZ[\]^_ */
- 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 6x `abcdefghijklmno */
- 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 }; /* 7X pqrstuvwxyz{\}~ DEL */
-
- PRIVATE char *hex = "0123456789ABCDEF";
-
- PUBLIC char * HTEscape ARGS2 (CONST char *, str,
- unsigned char, mask)
- {
- #define ACCEPTABLE(a) ( a>=32 && a<128 && ((isAcceptable[a-32]) & mask))
- CONST char * p;
- char * q;
- char * result;
- int unacceptable = 0;
- for(p=str; *p; p++)
- if (!ACCEPTABLE((unsigned char)TOASCII(*p)))
- unacceptable++;
- result = (char *) malloc(p-str + unacceptable+ unacceptable + 1);
- if (result == NULL) outofmem(__FILE__, "HTEscape");
- for(q=result, p=str; *p; p++) {
- unsigned char a = TOASCII(*p);
- if (!ACCEPTABLE(a)) {
- *q++ = HEX_ESCAPE; /* Means hex commming */
- *q++ = hex[a >> 4];
- *q++ = hex[a & 15];
- }
- else *q++ = *p;
- }
- *q++ = 0; /* Terminate */
- return result;
- }
-
-
/*	Decode %xx escaped characters			HTUnEscape()
**	-----------------------------
**
**	This function takes a pointer to a string in which some
**	characters may have been encoded in %xy form, where xy is
**	the ASCII hex code for character 16x+y.
**	The string is converted in place, as it will never grow.
*/
-
- PRIVATE char from_hex ARGS1(char, c)
- {
- return c >= '0' && c <= '9' ? c - '0'
- : c >= 'A' && c <= 'F'? c - 'A' + 10
- : c - 'a' + 10; /* accept small letters just in case */
- }
-
- PUBLIC char * HTUnEscape ARGS1( char *, str)
- {
- char * p = str;
- char * q = str;
- while(*p) {
- if (*p == HEX_ESCAPE) {
- p++;
- if (*p) *q = from_hex(*p++) * 16;
- if (*p) *q = FROMASCII(*q + from_hex(*p++));
- q++;
- } else {
- *q++ = *p++;
- }
- }
-
- *q++ = 0;
- return str;
-
- } /* HTUnEscape */
-
-
-